Garbo

home *** CD-ROM | disk | FTP | other *** search

/ Garbo / Garbo.cdr / mac / hypercrd / hc1_2_x / fretext1.sit / Free Text Source Code / indexFile 0.72.c < prev next >

Wrap

Text File | 1990-01-28 | 41KB | 1,668 lines

/* new indexFile XFCN for Free Text projects * copyright 1990 - Mark Zimmermann * * call the XFCN as 'indexFile()' ... it returns with nothing * if all goes well, and an error message (if possible) otherwise... * * For further information, write: * Mark ^Zimmermann * P.O.Box 8310 * Silver Spring, MD 20907 * USA * * Be sure to enclose a STAMPED, SELF-ADDRESSED ENVELOPE if you want * to receive a timely reply! * * Electronic addresses: * science@nems.dt.navy.mil * [75066,2044] CompuServe */ /* header files to include... */ #include <MacTypes.h> #include <EventMgr.h> #include <OSUtil.h> #include <FileMgr.h> #include <WindowMgr.h> #include <StdFilePkg.h> #include <HyperXCmd.h> #include <SetUpA4.h> /* sort on 28 characters -- long enough to avoid truncation of most real * words, and short enough to save some space in the *.k files */ #define KEY_LENGTH 28 /* my standard structure for the index_keys file */ typedef struct { char kkey[KEY_LENGTH]; long ccount; } KEY_REC; /* my standard structure for simple I/O buffers ... */ typedef struct { char *zbufbase; char *zbufp; long zbufcounter; long zbufsize; int zbuffilep; int zbufitemsize; } ZBUF; /* merge this many files at each stage of the merging operation for index * building ... 2 means a binary merge, etc. ... one needs to have at least * 5 + 2 * NMERGE I/O buffers around: for each of NMERGE files, there is * a *.k keys file and a *.p pointers file; plus there must be a single * output *.k and a single output *.p file; plus there is the need for stdin, * stdout, and stderr to be open as well. Thus, I have found that a 4-way * merge (NMERGE = 4) works pretty nicely.... */ #define NMERGE 4 #ifndef TRUE #define TRUE 1 #endif #ifndef FALSE #define FALSE 0 #endif #ifndef NULL #define NULL 0 #endif #ifndef EOF #define EOF (-1) #endif /* --------------function prototypes ----------------------- */ pascal void main (XCmdBlockPtr paramPtr); int zGetFile (Str255 *fileNamePtr, int *volRefNumPtr); void returnErrorMsg (XCmdBlockPtr paramPtr, char *msg); void give_msg (char *msg); void beepWait (void); long atol (char *s); int putNum (char *ans, long num); int strlen (char *s); char *strcpy (char *s1, char *s2); void create_zbuffer (int n, long bufsize, int buffile, int bufitemsize, ZBUF zbuffer[]); char *next_input_item (int n, ZBUF zbuffer[]); void load_zbuffer (int n, ZBUF zbuffer[]); char *next_output_item (int n, ZBUF zbuffer[]); void flush_zbuffer (int n, ZBUF zbuffer[]); int build_indices (long zbufsiz, int doc_file, int vRef0, int pass_number); void init_filter_table (void); char *make_buf (long bufsiz); long load_doc_buffer (char *doc, long doc_bufsiz, char **ptr, int doc_file); int zgetc (int refNum); char *strncpy(char *s1, char *s2, int n); void nway_merge_kpfiles (int ink[], int inp[], int outk, int outp, int n, long zbufsiz); void copy_ptr_recs (int inzbuf, long count, int outzbuf, ZBUF zbuffer[]); void copy_key_rec (char *kkey, long ccount, int outzbuf, ZBUF zbuffer[]); int merge_not_finished (KEY_REC *kr[], int n); int smallest_key (KEY_REC *kr[], int n); int merge_indices (long zbufsiz, int doc_file, int vRef0, int file_number, int generation_number, Str255 doc_filename); long write_sorted_files (char *doc, char **ptr, long nwords, long offset, long zbufsiz, int pass_number, int vRef0); int is_new_word (char *w0, char *w1); void write_new_key (char *p, char *kp); long file_size (int refnum); int open_inkfile (int file_num, int vRef0, int generation_number); int open_inpfile (int file_num, int vRef0, int generation_number); void fix_oddball_file_name (int vRef0, int generation_number, int file_number); void fix_final_file_names (Str255 doc_filename, int vRef0, int generation_number); int open_outkfile (int vRef0, int generation_number, int file_number); int open_outpfile (int vRef0, int generation_number, int file_number); void remove_used_infiles (int n, int vRef0, int generation_number, int file_number); void close_infiles (int ink[], int inp[], int n); int open_kfile (int vRef0, int pass_number); int open_pfile (int vRef0, int pass_number); long zftell (int refnum); void zqsort (char **first, char **last); int compare_ptrs (char **p1, char **p2); int zstrcmp (char *s1, char *s2); /* standard table to filter lower-case characters to capitals (if possible) * for sorting into alphabetical order ... my best estimates of what to * do for foreign characters in the Apple character set ... consider * using international utilities someday to do the sorting better... */ char filter_table[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 0, 0, 0, 0, 0, 0, 0, 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 0, 0, 0, 0, 0, 0, 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 0, 0, 0, 0, 0, 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87, 0xCB,0x89,0x80,0xCC,0x81,0x82,0x83,0x8F, 0x90,0x91,0x92,0x93,0x94,0x95,0x84,0x97, 0x98,0x99,0x85,0xCD,0x9C,0x9D,0x9E,0x86, 0, 0, 0, 0, 0, 0, 0,0xA7, 0, 0, 0, 0, 0, 0,0xAE,0xAF, 0, 0, 0, 0, 0,0xB5,0xC6,0xB7, 0xB8,0xB8, 0,0xBB,0xBC,0xBD,0xAE,0xAF, 0, 0, 0, 0, 0, 0,0xC6, 0, 0, 0, 0,0xCB,0xCC,0xCD,0xCE,0xCE, 0, 0, 0, 0, 0, 0, 0, 0, 0xD8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } ; /* ----------------------main program------------------------- */ pascal void main (paramPtr) XCmdBlockPtr paramPtr; { Str255 doc_filename; char info[128]; Ptr testzbufsiz; WindowRecord w_record; WindowPtr info_window; Rect b_rect; Handle answer; int doc_file, vRef0, pass_number, generation_number, prev_gen_number, file_number, i; long zbufsiz, doc_filesize, grow; RememberA0(); SetUpA4(); if (paramPtr->paramCount != 0) { returnErrorMsg (paramPtr, "{Sorry, wrong number of parameters in indexFile call!}"); RestoreA4(); return; } /* see how much memory we can get for our zbuffers ... */ MaxApplZone(); zbufsiz = MaxMem (&grow) - 32768; zbufsiz /= (2 * NMERGE + 2); zbufsiz = zbufsiz - zbufsiz % (sizeof(KEY_REC) * sizeof(long)); if (zbufsiz < sizeof(KEY_REC) * sizeof(long) || (testzbufsiz = NewPtr (zbufsiz * (2 * NMERGE + 2))) == NULL) { returnErrorMsg (paramPtr, "{Sorry, not enough free memory to build an index!}"); RestoreA4(); return; } DisposPtr (testzbufsiz); /* set up a window to give user feedback during indexing */ b_rect.top = 30; b_rect.left = 12; b_rect.bottom = 332; b_rect.right = 500; info_window = NewWindow (&w_record, &b_rect, "\p", (Boolean)1, dBoxProc, (WindowPtr)-1, (Boolean)0, (long)0); ShowWindow (info_window); SetPort (info_window); TextFont (0); give_msg ("\pGreetings! Please choose a file to be indexed..."); if (! zGetFile (&doc_filename, &vRef0)) { CloseWindow (info_window); RestoreA4(); return; } i = FSOpen (doc_filename, vRef0, &doc_file); if (i != noErr && i != opWrErr) { give_msg ("\pSorry, error opening the file! Click mouse to exit..."); beepWait (); CloseWindow (info_window); RestoreA4(); return; } give_msg ("\pBeginning to build index subfiles ╤ please be patient..."); pass_number = 0; while (build_indices (zbufsiz, doc_file, vRef0, pass_number)) ++pass_number; if (pass_number == 0) give_msg ("\pNo data to index!"); else { if (pass_number > 1) give_msg ("\pBeginning merge generation #1 ╤ please be patient..."); file_number = 0; generation_number = 0; do { prev_gen_number = generation_number; generation_number = merge_indices (zbufsiz, doc_file, vRef0, file_number, generation_number, doc_filename); if (generation_number == prev_gen_number) file_number += NMERGE; else file_number = 0; } while (generation_number >= 0); give_msg ("\pIndexing completed!"); } FSClose (doc_file); give_msg ("\pClick mouse to exit..."); beepWait (); CloseWindow (info_window); RestoreA4(); return; } /* this routine, adapted from the Lightspeed C 'MiniEdit' example, does * the standard files dialog routine to get the name of the file to be indexed * for the database; other file names are gotten from that by putting * a '.k' or a '.p' at the end. This routine returns 1 if all is well, * or 0 if the user cancels out... * * note that the file type is not used here, though a parameter has to be * passed (I think) to SFGetFile().... */ int zGetFile (fileNamePtr, volRefNumPtr) Str255 *fileNamePtr; int *volRefNumPtr; { SFTypeList myFileTypes; Point SFGwhere; SFReply myReply; register int i; SFGwhere.v = 90; SFGwhere.h = 82; myFileTypes[0] = 'TEXT'; /* this isn't actually used */ SFGetFile (SFGwhere, "\p", 0L, -1, myFileTypes, 0L, &myReply); if (myReply.good) { for (i = *myReply.fName; i >= 0; --i) (*fileNamePtr)[i] = myReply.fName[i]; *volRefNumPtr = myReply.vRefNum; return (1); } else return (0); } /* function to set the return value of the XFCN to a chosen error msg; * if there isn't enough free memory to give us a Handle to the msg, * beep a bit more and then return! */ void returnErrorMsg (paramPtr, msg) XCmdBlockPtr paramPtr; char *msg; { Handle answer; int msgLength; SysBeep (10); msgLength = strlen (msg); if ((answer = NewHandle (1 + msgLength)) == NULL) { SysBeep (10); SysBeep (10); return; } strcpy (*answer, msg); paramPtr->returnValue = answer; return; } /* tiny routine to put a message into my message window.... takes in a * PASCAL type string and displays it decently, scrolling text up.... */ void give_msg (msg) char *msg; { Rect b_rect; RgnHandle theRgn; b_rect.top = b_rect.left = 0; b_rect.bottom = 302; b_rect.right = 488; theRgn = NewRgn (); ScrollRect (&b_rect, 0, -15, theRgn); DisposeRgn (theRgn); MoveTo (4, 295); DrawString (msg); return; } /* function to beep and then wait for a mouse click before returning... * delay for 2 ticks to debounce the mouse button a bit .... flush * events to avoid spurious clicks hanging around.... */ void beepWait () { long tickcount; SysBeep (10); while (! Button()) ; Delay (2L, &tickcount); while (Button()) ; Delay (2L, &tickcount); FlushEvents (everyEvent, 0); return; } /* function to convert alphabetic string to a long integer ... */ long atol (s) register char *s; { register char signflag = 0; register long r = 0; while ((*s == ' ')) s++; if (*s == '-') { signflag = 1; s++; } else if (*s == '+') s++; while (*s >= '0' && *s <= '9') r = r * 10 + (*s++ - '0'); return (signflag ? -r : r); } /* function to convert a number into a string and put it into the chosen * target place ... returns the number of characters stored ... * based on K&R p. 60 example of itoa().... * * ---> note, there is NO TERMINAL '\0' in the string that is built!!!! <--- */ int putNum (ans, num) register char *ans; register long num; { register int i, j; int s, result; i = 0; s = 1; if (num < 0) { num = -num; s = -1; } do ans[i++] = num % 10 + '0'; while ((num /= 10) > 0); if (s < 0) ans[i++] = '-'; result = i; for (--i, j = 0; j < i; ++j, --i) { s = ans[i]; ans[i] = ans[j]; ans[j] = s; } return (result); } /* function to determine the length of a string ... standard thing, * adapted from K&R p.98 .... */ int strlen (s) register char *s; { char *s0 = s; while (*s++) ; return (s - s0 - 1); } /* function to copy a string from one place to another, in a rather * obvious fashion ... adapted from K&R p.101 .... */ char *strcpy (s1, s2) register char *s1, *s2; { char *s = s1; while (*s1++ = *s2++) ; return (s); } /* -------------------buffer-related functions-----------------*/ /* function to create a zbuffer and set its parameters appropriately */ void create_zbuffer (n, bufsize, buffile, bufitemsize, zbuffer) int n, bufitemsize; long bufsize; int buffile; ZBUF zbuffer[]; { zbuffer[n].zbufbase = make_buf (bufsize); zbuffer[n].zbufp = zbuffer[n].zbufbase; zbuffer[n].zbufcounter = 0; zbuffer[n].zbufsize = bufsize; zbuffer[n].zbuffilep = buffile; zbuffer[n].zbufitemsize = bufitemsize; } /* function to return a pointer to the next item in a chosen input * buffer 'n'; it reloads the buffer if necessary ... returns NULL * pointer when there's nothing left in the file ... */ char *next_input_item (n, zbuffer) int n; ZBUF zbuffer[]; { char *result; if (zbuffer[n].zbufcounter == 0) load_zbuffer (n, zbuffer); zbuffer[n].zbufcounter -= zbuffer[n].zbufitemsize; if (zbuffer[n].zbufcounter >= 0) { result = zbuffer[n].zbufp; zbuffer[n].zbufp += zbuffer[n].zbufitemsize; return (result); } else return (NULL); } /* function to load the nth zbuffer appropriately ... it resets the buffer * pointers, etc. ... */ void load_zbuffer (n, zbuffer) int n; ZBUF zbuffer[]; { long nread; OSErr err; nread = zbuffer[n].zbufsize; err = FSRead (zbuffer[n].zbuffilep, &nread, zbuffer[n].zbufbase); zbuffer[n].zbufp = zbuffer[n].zbufbase; zbuffer[n].zbufcounter = nread; if (err != noErr && err != eofErr) { give_msg ("\pSorry, fatal error loading buffer -- click mouse to exit, then reboot!"); beepWait (); RestoreA4(); ExitToShell (); } return; } /* function to return a pointer to the right place to store the next * output item for the nth zbuffer ... when the buffer becomes full, * it flushes it to disk, resets pointers, etc. */ char *next_output_item (n, zbuffer) int n; ZBUF zbuffer[]; { char *result; if (zbuffer[n].zbufcounter == zbuffer[n].zbufsize) flush_zbuffer (n, zbuffer); result = zbuffer[n].zbufp; zbuffer[n].zbufp += zbuffer[n].zbufitemsize; zbuffer[n].zbufcounter += zbuffer[n].zbufitemsize; return (result); } /* function to flush out a zbuffer to the disk ... */ void flush_zbuffer (n, zbuffer) int n; ZBUF zbuffer[]; { long chars_written; if (zbuffer[n].zbufcounter == 0) return; chars_written = zbuffer[n].zbufcounter; if (FSWrite (zbuffer[n].zbuffilep, &chars_written, zbuffer[n].zbufbase) != noErr || chars_written == 0) { give_msg ("\pSorry, fatal error flushing buffer -- click mouse to exit, then reboot!"); beepWait (); RestoreA4(); ExitToShell (); } zbuffer[n].zbufp = zbuffer[n].zbufbase; zbuffer[n].zbufcounter = 0; } /* ------------------index-building functions------------------*/ int build_indices (zbufsiz, doc_file, vRef0, pass_number) long zbufsiz; int doc_file, vRef0, pass_number; { long doc_bufsiz, offset, nwords, ndistinctwords; char *doc, **ptr, info[128]; int i; doc_bufsiz = 2 * NMERGE * zbufsiz / 3 - KEY_LENGTH; doc = make_buf (doc_bufsiz + KEY_LENGTH); ptr = (char **)make_buf (doc_bufsiz * 2); offset = zftell (doc_file); nwords = load_doc_buffer (doc, doc_bufsiz, ptr, doc_file); if (nwords == 0) { DisposPtr (doc); DisposPtr (ptr); return (FALSE); } zqsort (ptr, ptr + nwords); ndistinctwords = write_sorted_files (doc, ptr, nwords, offset, zbufsiz, pass_number, vRef0); DisposPtr (doc); DisposPtr (ptr); strncpy (info + 1, "Index subfile #", 15); i = 16 + putNum (info + 16, pass_number + 1); strncpy (info + i, ": ", 3); i += 3; i += putNum (info + i, nwords); strncpy (info + i, " total words, ", 14); i += 14; i += putNum (info + i, ndistinctwords); strncpy (info + i, " distinct words.", 16); i += 16; info[0] = i - 1; give_msg (info); return (TRUE); } /* ---------functions to load text from document file filter it-------- */ /* function to create a buffer for me to use... */ char *make_buf (bufsiz) long bufsiz; { char *buf; buf = NewPtr (bufsiz); if (buf == 0) { give_msg ("\pSorry, fatal error creating buffer -- click mouse to exit, then reboot!"); beepWait (); RestoreA4(); ExitToShell (); } return (buf); } /* function to load the document buffer ... bring in doc_bufsiz * characters, and then enough more to finish out the final word, * followed by a terminal delimiter .... as the characters are scanned * filter them appropriately (change delimiters to '\0's) and * build the pointer array in memory to the first character of each * word ... return the total number of words that were * read in to the buffer (zero if we're finished with the file) * * ... note that one must be sure to pull in and throw away * any excess characters beyond KEY_LENGTH in the final word, so that * the remaining fragment doesn't show up as the first "word" in the * next chunk of the file.... * * For maximum speed and efficiency, use a hybrid approach: do a * big buffer-sized chunk of the file first, then take one character at * a time to finish off.... */ long load_doc_buffer (doc, doc_bufsiz, ptr, doc_file) register char *doc, **ptr; long doc_bufsiz; int doc_file; { int i, err; register int c, in_a_word = FALSE; char **ptr0, *end_doc_buf; ptr0 = ptr; err = FSRead (doc_file, &doc_bufsiz, doc); if (err != noErr && err != eofErr) { give_msg ("\pSorry, fatal error reading text -- click mouse to exit, then reboot!"); beepWait (); RestoreA4(); ExitToShell (); } end_doc_buf = doc + doc_bufsiz; while (doc < end_doc_buf) { c = filter_table[*(unsigned char *)doc]; if (c == '\0') in_a_word = FALSE; else if (! in_a_word) { *ptr++ = doc; in_a_word = TRUE; } *doc++ = c; } if (err != eofErr && doc == end_doc_buf && in_a_word) { for (i = 0; i < KEY_LENGTH; ++i) { c = zgetc (doc_file); if (c == EOF) { *doc++ = '\0'; break; } c = filter_table[c]; if (c == '\0') { *doc++ = '\0'; break; } *doc++ = c; } if (i == KEY_LENGTH) while ((c = zgetc (doc_file)) != EOF && filter_table[c]) ; } return (ptr - ptr0); } /* my function to replace getc() ... give up and call it EOF on any error * at all, at this point! ... note also that we must return the UNSIGNED * character read in, to handle Mac special characters properly.... */ int zgetc (refNum) int refNum; { unsigned char readbuf[1]; long count = 1; if (FSRead (refNum, &count, readbuf) == noErr) return (readbuf[0]); else return (EOF); } /* function to copy a string of length up to n from s2 to s1.... */ char *strncpy (s1, s2, n) register char *s1, *s2; register int n; { char *s0 = s1; if (n > 0) { while (n-- && (*s1++ = *s2++)); while (n > 0) { /* pad out to n chars -- H&S specs it this way... */ *s1++ = '\0'; n--; } } return (s0); } /* ------functions to merge key and ptr files-------------------- */ /* function to do the real work of merging sorted k & p * files into a single sorted file... * * Procedure is as follows: * * Compare the current key records from each of the N files to be merged. * Take the smallest of the keys (or, if there is a tie, take the one * from the earlier file number) and write its pointer records out to * the *.p file for the next generation. If the key is a new one, that * is, if it differs from the previous key, write out the previous key * word and the value for cumulative_counts to the *.k file, and reset * the previous key's value to that of the current key. Then repeat * this whole procedure, until all the input files are exhausted. * * The above works provided that we are careful in choosing the smallest * key and never let a file that has been exhausted (reached EOF) win a * comparison. The function smallest_key does that properly below, since * a file that is at EOF has returned a NULL pointer for its key_rec... * * For each file, the variable prev_cc[i] holds the previous value of * cumulative_counts for that file, so that we can determine how * many ptr_recs to write out by doing a simple subtraction. * * Buffer numbering scheme: the Kth input file has zbuffer #K * for its keys and zbuffer #(K+n) for its pointers; the output * buffers are zbuffers #(2*n) for keys and #(2*n+1) for pointers. */ void nway_merge_kpfiles (ink, inp, outk, outp, n, zbufsiz) int ink[], inp[], outk, outp, n; long zbufsiz; { int i; KEY_REC *kr[NMERGE], prev_key; long prev_cc[NMERGE], out_cc; ZBUF zbuffer[NMERGE * 2 + 2]; for (i = 0; i < n; ++i) prev_cc[i] = 0; out_cc = 0; for (i = 0; i < n; ++i) { create_zbuffer (i, zbufsiz, ink[i], sizeof(KEY_REC), zbuffer); create_zbuffer (i + n, zbufsiz, inp[i], sizeof(long), zbuffer); } create_zbuffer (2 * n, zbufsiz, outk, sizeof(KEY_REC), zbuffer); create_zbuffer (2 * n + 1, zbufsiz, outp, sizeof(long), zbuffer); for (i = 0; i < n; ++i) kr[i] = (KEY_REC *)next_input_item (i, zbuffer); i = smallest_key (kr, n); strncpy (prev_key.kkey, kr[i]->kkey, KEY_LENGTH); while (merge_not_finished (kr, n)) { i = smallest_key (kr, n); if (zstrcmp (prev_key.kkey, kr[i]->kkey)) { copy_key_rec (prev_key.kkey, out_cc, 2 * n, zbuffer); strncpy (prev_key.kkey, kr[i]->kkey, KEY_LENGTH); } copy_ptr_recs (i + n, kr[i]->ccount - prev_cc[i], 2 * n + 1, zbuffer); out_cc += kr[i]->ccount - prev_cc[i]; prev_cc[i] = kr[i]->ccount; kr[i] = (KEY_REC *)next_input_item (i, zbuffer); } copy_key_rec (prev_key.kkey, out_cc, 2 * n, zbuffer); flush_zbuffer (2 * n, zbuffer); flush_zbuffer (2 * n + 1, zbuffer); for (i = 0; i < 2 * n + 2; ++i) DisposPtr (zbuffer[i].zbufbase); } /* function to copy a chosen number of ptr_recs (longs) from one file * to another ... used in merging two files by merge_kpfiles() above.... */ void copy_ptr_recs (inzbuf, count, outzbuf, zbuffer) int inzbuf, outzbuf; long count; ZBUF zbuffer[]; { long *inp, *outp; for ( ; count > 0; --count) { inp = (long *)next_input_item (inzbuf, zbuffer); outp = (long *)next_output_item (outzbuf, zbuffer); *outp = *inp; } } /* function to write a key record including kkey (key word) and ccount * (cumulative_count) to an output file... */ void copy_key_rec (kkey, cc, outzbuf, zbuffer) char *kkey; long cc; int outzbuf; ZBUF zbuffer[]; { KEY_REC *outp; outp = (KEY_REC *)next_output_item (outzbuf, zbuffer); strncpy (outp->kkey, kkey, KEY_LENGTH); outp->ccount = cc; } /* simple function to see if we are not yet finished with all of the input * files coming in to the merge operation ... signified by at least one of * the key pointers being non-NULL.... */ int merge_not_finished (kr, n) KEY_REC *kr[]; register int n; { register int i; for (i = 0; i < n; ++i) if (kr[i] != NULL) return (TRUE); return (FALSE); } /* function to determine the smallest of the n keys pointed to by the * kr[] vector of pointers to KEY_RECs ... note that a NULL ptr ranks * after any other...also note that in case of a tie, the smaller * value of i is the one to return (for a stable merging sort) */ smallest_key (kr, n) KEY_REC *kr[]; register int n; { register int i, smallest; for (i = 0; i < n; ++i) if (kr[i] != NULL) { smallest = i; break; } for (i = smallest + 1; i < n; ++i) if (kr[i] != NULL) if (zstrcmp (kr[smallest]->kkey, kr[i]->kkey) > 0) smallest = i; return (smallest); } /* ---------------function to merge index files------------- */ /* function to merge sorted indices together repeatedly until finished * with them all in a single set of *.k/*.p files ... * * The merging strategy is straightforward enough: * Let "g" denote the generation_number and "f" denote the file_number. * Temporary file names begin with the letter z, which is then followed * by a generation number (decimal), the letter k or p (standing for * key or ptr, respectively), and then the file number (decimal). Thus, * the file "z0k0" is the keys file #0 for generation #0 (the starting, * pre-merging, generation), file "z2p3" is the ptr file #3 for generation * #2, etc. * * (The following discussion is specifically for a 2-way merge ... but * the generalization for N-way merging is straightforward.) * * On a given call to merge_indices, the following may happen: * - files zgkf/zgpf and zgk(f+1)/zgp(f+1) are merged into files * z(g+1)k(f/2)/z(g+1)p(f/2), and then the parent files are * deleted; * - file zgkf isn't found, which means we are done with this * generation and must go on to the next; * - file zgk(f+1) isn't found, which means that either we are * completely done with the merging work (if f=0) and just * have to rename the files zgkf/zgpf into the correct final * names (that is, doc_filename.k/doc_filename.p), or else * (if f>0) we have an odd number of files for this level * of merging, and therefore just have to rename zgkf/zgpf * to z(g+1)k(f/2)/z(g+1)p(f/2). * * to work as a nice XFCN, this function returns the upcoming * generation_number, and lets the caller reset the file_number to zero * when the generation_number returned differs from the input number, or * increment the file_number by NMERGE when still on same generation. * Also, when all finished, we return an illegal (negative) generation_number * value.... */ int merge_indices (zbufsiz, doc_file, vRef0, file_number, generation_number, doc_filename) long zbufsiz; int doc_file, vRef0, file_number, generation_number; Str255 doc_filename; { int ink[NMERGE], inp[NMERGE], outk, outp; long inwords, indistinctwords, outdistinctwords; int i, n; char info[128]; for (n = 0; n < NMERGE; ++n) { ink[n] = open_inkfile (file_number + n, vRef0, generation_number); if (ink[n] == NULL) break; inp[n] = open_inpfile (file_number + n, vRef0, generation_number); } if (file_number + n == 1) { close_infiles (ink, inp, n); fix_final_file_names (doc_filename, vRef0, generation_number); return (-1); } if (n < 2) { if (n == 1) { close_infiles (ink, inp, n); fix_oddball_file_name (vRef0, generation_number, file_number); } strncpy (info + 1, "Beginning merge generation #", 28); i = 29 + putNum (info + 29, 2 + generation_number); info[0] = i - 1; give_msg (info); return (generation_number + 1); } outk = open_outkfile (vRef0, generation_number, file_number); outp = open_outpfile (vRef0, generation_number, file_number); inwords = 0; indistinctwords = 0; for (i = 0; i < n; ++i) { inwords += file_size (inp[i]) / sizeof(long); indistinctwords += file_size (ink[i]) / sizeof(KEY_REC); } nway_merge_kpfiles (ink, inp, outk, outp, n, zbufsiz); outdistinctwords = file_size (outk) / sizeof(KEY_REC); strncpy (info + 1, "Merge #", 7); i = 8 + putNum (info + 8, 1 + file_number / NMERGE); strncpy (info + i, ": ", 3); i += 3; i += putNum (info + i, inwords); strncpy (info + i, " total words, ", 14); i += 14; i += putNum (info + i, indistinctwords); strncpy (info + i, " distinct words in, ", 20); i += 20; i += putNum (info + i, outdistinctwords); strncpy (info + i, " out.", 5); i += 5; info[0] = i - 1; give_msg (info); close_infiles (ink, inp, n); FSClose (outk); FSClose (outp); remove_used_infiles (n, vRef0, generation_number, file_number); return (generation_number); } /* ------------file utility functions---------------------------- */ /* function to write out sorted k & p files based on the doc and ptr * arrays in memory.... * * The kfile format is as described in detail elsewhere: * the key word, turned into all capital letters and with spaces * afterward, of fixed length KEY_LENGTH; and * the cumulative count of how many words have passed before, including * the current word, a long integer. */ long write_sorted_files (doc, ptr, nwords, offset, zbufsiz, pass_number, vRef0) char *doc, **ptr; long nwords, offset, zbufsiz; int pass_number, vRef0; { int kfile, pfile; char *prev_word; KEY_REC *outk; long *outp, i; ZBUF zbuffer[2]; if (nwords == 0) return; kfile = open_kfile (vRef0, pass_number); pfile = open_pfile (vRef0, pass_number); create_zbuffer (0, zbufsiz, kfile, sizeof(KEY_REC), zbuffer); create_zbuffer (1, zbufsiz, pfile, sizeof(long), zbuffer); prev_word = ptr[0]; outk = (KEY_REC *)next_output_item (0, zbuffer); write_new_key (ptr[0], outk->kkey); for (i = 0; i < nwords; ++i) { if (is_new_word (prev_word, ptr[i])) { outk->ccount = i; outk = (KEY_REC *)next_output_item (0, zbuffer); write_new_key (ptr[i], outk->kkey); prev_word = ptr[i]; } outp = (long *)next_output_item (1, zbuffer); *outp = (ptr[i] - doc) + offset; } outk->ccount = i; flush_zbuffer (0, zbuffer); flush_zbuffer (1, zbuffer); DisposPtr (zbuffer[0].zbufbase); DisposPtr (zbuffer[1].zbufbase); i = file_size (kfile) / sizeof(KEY_REC); FSClose (kfile); FSClose (pfile); return (i); } /* function to determine if the current word is the same as or different * from the previous word -- if it is different, we'll need to write an * entry out to the key file kfile -- compare the words up to the first * '\0', or for a maximum distance of KEY_LENGTH, and return TRUE * if they differ, FALSE if they are identical that far. Thus, a simple * call to zstrcmp() does the job.... but keep ours as a function instead * of a macro call for the moment, for safety and readability.... */ int is_new_word (w0, w1) char *w0, *w1; { return (zstrcmp (w0, w1)); } /* function to write out a new key entry in the key_file: * KEY_LENGTH letters consisting of the key word (which will be found * delimited by a '\0'), followed by enough blanks to fill out the * record to total length KEY_LENGTH ... */ void write_new_key (p, kp) register char *p, *kp; { register int i, c; for (i = 0; i < KEY_LENGTH; ++i) { c = *p++; if (c == '\0') break; *kp++ = c; } for ( ; i < KEY_LENGTH; ++i) *kp++ = ' '; return; } /* a very simple function to return the size of a file ... doesn't do any * error-checking -- sorry! */ long file_size (refnum) int refnum; { long result; GetEOF (refnum, &result); return (result); } /* ---------------miscellaneous useful functions----------------- */ /* function to open an input key file for this generation and file number ... * note the limitation to no more than 9 generations, which with a 4-way * merge should be more than enough for a few years to come! Note that * this function should return 0 and not give up if it attempts to open * a nonexistent file -- that's the signal for merge_indices() to know * that all of the input files have been used up.... */ int open_inkfile (file_num, vRef0, generation_number) int file_num, vRef0, generation_number; { Str255 fname; int i; fname[1] = 'z'; fname[2] = generation_number + '0'; fname[3] = 'k'; i = putNum ((char *)fname + 4, file_num); fname[0] = i + 3; if (FSOpen (fname, vRef0, &i) != noErr) return (NULL); return (i); } /* function to open an input ptr file for this generation and file number */ int open_inpfile (file_num, vRef0, generation_number) int file_num, vRef0, generation_number; { Str255 fname; int i; fname[1] = 'z'; fname[2] = generation_number + '0'; fname[3] = 'p'; i = putNum ((char *)fname + 4, file_num); fname[0] = i + 3; if (FSOpen (fname, vRef0, &i) != noErr) return (NULL); return (i); } /* function to open an output key file for this generation and file number... * use my Official Apple-Approved File Type 'CTLZ' (for control-Z = ^Z)... */ int open_outkfile (vRef0, generation_number, file_number) int vRef0, generation_number, file_number; { Str255 fname; int i; fname[1] = 'z'; fname[2] = generation_number + 1 + '0'; fname[3] = 'k'; i = putNum ((char *)fname + 4, file_number / NMERGE); fname[0] = i + 3; if (Create (fname, vRef0, '????', 'CTLZ') != noErr) { give_msg ("\pSorry, fatal error creating merge key file -- click mouse to exit, then reboot!"); beepWait (); RestoreA4(); ExitToShell (); } if (FSOpen (fname, vRef0, &i) != noErr) { give_msg ("\pSorry, fatal error opening merge key file -- click mouse to exit, then reboot!"); beepWait (); RestoreA4(); ExitToShell (); } return (i); } /* function to open an output ptr file for this generation and file number * again, use 'CTLZ' file type... */ int open_outpfile (vRef0, generation_number, file_number) int vRef0, generation_number, file_number; { Str255 fname; int i; fname[1] = 'z'; fname[2] = generation_number + 1 + '0'; fname[3] = 'p'; i = putNum ((char *)fname + 4, file_number / NMERGE); fname[0] = i + 3; if (Create (fname, vRef0, '????', 'CTLZ') != noErr) { give_msg ("\pSorry, fatal error creating merge ptr file -- click mouse to exit, then reboot!"); beepWait (); RestoreA4(); ExitToShell (); } if (FSOpen (fname, vRef0, &i) != noErr) { give_msg ("\pSorry, fatal error opening merge ptr file -- click mouse to exit, then reboot!"); beepWait (); RestoreA4(); ExitToShell (); } return (i); } /* function to rename the remaining last unpaired key file & ptr file * from one generation to the next... */ void fix_oddball_file_name (vRef0, generation_number, file_number) int vRef0, generation_number, file_number; { Str255 oldname, newname; int i; oldname[1] = 'z'; oldname[2] = generation_number + '0'; oldname[3] = 'k'; i = putNum ((char *)oldname + 4, file_number); oldname[0] = i + 3; newname[1] = 'z'; newname[2] = generation_number + 1 + '0'; newname[3] = 'k'; i = putNum ((char *)newname + 4, file_number / NMERGE); newname[0] = i + 3; if (Rename (oldname, vRef0, newname) != noErr) { give_msg ("\pSorry, error renaming merge key file ... click mouse to continue"); beepWait (); } oldname[1] = 'z'; oldname[2] = generation_number + '0'; oldname[3] = 'p'; i = putNum ((char *)oldname + 4, file_number); oldname[0] = i + 3; newname[1] = 'z'; newname[2] = generation_number + 1 + '0'; newname[3] = 'p'; i = putNum ((char *)newname + 4, file_number / NMERGE); newname[0] = i + 3; if (Rename (oldname, vRef0, newname) != noErr) { give_msg ("\pSorry, error renaming merge ptr file ... click mouse to continue"); beepWait (); } return; } /* function to give the final key and ptr files their proper ultimate * names ... ok for it to mess with doc_filename string.... */ void fix_final_file_names (doc_filename, vRef0, generation_number) Str255 doc_filename; int vRef0, generation_number; { Str255 oldname; oldname[0] = 4; oldname[1] = 'z'; oldname[2] = generation_number + '0'; oldname[3] = 'k'; oldname[4] = '0'; doc_filename[++doc_filename[0]] = '.'; doc_filename[++doc_filename[0]] = 'k'; if (Rename (oldname, vRef0, doc_filename) != noErr) { give_msg ("\pSorry, error renaming final key file ... click mouse to continue"); beepWait (); } oldname[3] = 'p'; doc_filename[doc_filename[0]] = 'p'; if (Rename (oldname, vRef0, doc_filename) != noErr) { give_msg ("\pSorry, error renaming final ptr file ... click mouse to continue"); beepWait (); } return; } /* function to get rid of the superfluous k & p files now that they * have been merged into the next generation.... */ void remove_used_infiles (n, vRef0, generation_number, file_number) int n, vRef0, generation_number, file_number; { Str255 fname; int i, j; fname[1] = 'z'; fname[2] = generation_number + '0'; for (i = 0; i < n; ++i) { fname[3] = 'k'; j = putNum ((char *)fname + 4, file_number + i); fname[0] = j + 3; if (FSDelete (fname, vRef0) != noErr) { give_msg ("\pSorry, error deleting merge key file ... click mouse to continue"); beepWait (); } fname[3] = 'p'; if (FSDelete (fname, vRef0) != noErr) { give_msg ("\pSorry, error deleting merge ptr file ... click mouse to continue"); beepWait (); } } return; } /* function to close out the ink and inp files that have been opened... */ void close_infiles (ink, inp, n) int ink[], inp[]; int n; { int i; for (i = 0; i < n; ++i) { FSClose (ink[i]); FSClose (inp[i]); } } /* function to open the key file with proper name for this pass ... file will be * named "z0kN" where N represents the pass number .... type 'CTLZ' */ int open_kfile (vRef0, pass_number) int vRef0, pass_number; { Str255 fname; int i; fname[1] = 'z'; fname[2] = '0'; fname[3] = 'k'; i = putNum ((char *)fname + 4, pass_number); fname[0] = i + 3; if (Create (fname, vRef0, '????', 'CTLZ') != noErr) { give_msg ("\pSorry, fatal error creating key file -- click mouse to exit, then reboot!"); beepWait (); RestoreA4(); ExitToShell (); } if (FSOpen (fname, vRef0, &i) != noErr) { give_msg ("\pSorry, fatal error opening key file -- click mouse to exit, then reboot!"); beepWait (); RestoreA4(); ExitToShell (); } return (i); } /* function to open the ptr file with proper name for this pass ... file will be * named "z0pN" where N represents the pass number .... type 'CTLZ' */ int open_pfile (vRef0, pass_number) int vRef0, pass_number; { Str255 fname; int i; fname[1] = 'z'; fname[2] = '0'; fname[3] = 'p'; i = putNum ((char *)fname + 4, pass_number); fname[0] = i + 3; if (Create (&fname, vRef0, '????', 'CTLZ') != noErr) { give_msg ("\pSorry, fatal error creating ptr file -- click mouse to exit, then reboot!"); beepWait (); RestoreA4(); ExitToShell (); } if (FSOpen (&fname, vRef0, &i) != noErr) { give_msg ("\pSorry, fatal error opening ptr file -- click mouse to exit, then reboot!"); beepWait (); RestoreA4(); ExitToShell (); } return (i); } /* function to take the place of stdio's ftell() */ long zftell (refnum) int refnum; { long ans; GetFPos (refnum, &ans); return (ans); } /* --------------function to do quicksort -------------------------- */ /* sort elements "first" through "last" */ void zqsort (first, last) register char **first, **last; { register char **i, **j, *tmp; while (last - first > 1) { i = first; j = last; for (;;) { while (++i < last && compare_ptrs (i, first) < 0) ; while (--j > first && compare_ptrs (j, first) > 0) ; if (i >= j) break; tmp = *i; *i = *j; *j = tmp; } tmp = *first; *first = *j; *j = tmp; if (j - first < last - (j + 1)) { zqsort (first, j); first = j + 1; } else { zqsort (j + 1, last); last = j; } } } /* function to compare two index ptrs and give a result appropriate * for quicksort to use in alphabetizing them.... * * Since the words pointed to have already been turned into all capital * letters and delimiters have been filtered out, simply doing zstrcmp() * for KEY_LENGTH letters works fine! * * Slight modification to make the quicksort stable: if two words tie, * then we want to compare their pointers to make the lesser one come * out first in the sort ... */ int compare_ptrs (p1, p2) register char **p1, **p2; { register int diff; diff = zstrcmp (*p1, *p2); if (diff == 0) diff = ((*p1 - *p2) > 0) ? 1 : -1; return (diff); } /* my function to compare two strings and give a result as to who is * alphabetically earlier. Note that this is almost the same as strncmp() * with the fixed value of KEY_LENGTH as the maximum comparison distance, * except that I must be sure to mask the characters to make them positive * (since we want to be able to handle the non-ASCII funny letters in * the Apple character set properly/consistently). If the masking isn't * done, then inconsistent results can occur with those non-ASCII chars! */ int zstrcmp (s1, s2) register char *s1, *s2; { register int n = KEY_LENGTH; for (; --n && ((*s1 & 0xFF) == (*s2 & 0xFF)); s1++, s2++) if (!*s1) break; return ((*s1 & 0xFF) - (*s2 & 0xFF)); }